Dependencies

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.0     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
library(klaR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
# Read the Black Friday purchase records from disk into a tibble.
data.tb <- read_csv(file = "./data/BlackFriday.csv")
## Parsed with column specification:
## cols(
##   User_ID = col_double(),
##   Product_ID = col_character(),
##   Gender = col_character(),
##   Age = col_character(),
##   Occupation = col_double(),
##   City_Category = col_character(),
##   Stay_In_Current_City_Years = col_character(),
##   Marital_Status = col_double(),
##   Product_Category_1 = col_double(),
##   Product_Category_2 = col_double(),
##   Product_Category_3 = col_double(),
##   Purchase = col_double()
## )
#data.tb %>% head(25)

# Working sample: 2000 rows drawn at random with replacement, so the same
# source row may appear more than once.
data_random.tb <- data.tb %>%
  sample_n(size = 2000, replace = TRUE)

#data_random_unique.tb <- unique(data_random.tb)

#test.tb <- data.tb %>% head(2000)

Find the unique values of Age

# List the distinct Age bracket labels present in the sample.
unique(data_random.tb[["Age"]])
## [1] "46-50" "18-25" "26-35" "51-55" "36-45" "55+"   "0-17"

Function that maps an Age bracket label to a numeric code representing the group

# Map Age bracket labels to ordinal numeric codes (1 = "0-17" ... 7 = "55+").
#
# age: character vector (or factor) of Age bracket labels, of any length.
# Returns an unnamed numeric vector the same length as `age`. Labels that are
# not one of the seven known brackets (including NA) map to NA.
ageFilter <- function(age) {
  age_codes <- c(
    "0-17" = 1, "18-25" = 2, "26-35" = 3, "36-45" = 4,
    "46-50" = 5, "51-55" = 6, "55+" = 7
  )
  # as.character() so a factor is looked up by its label, not its integer code.
  unname(age_codes[as.character(age)])
}
ageFilter("51-55")
## [1] 6

Apply ageFilter() to each value of the Age column and append the result to the tibble as a numeric `age` column

# Add a numeric `age` column holding the ordinal code of each row's Age bracket.
#
# vapply() calls ageFilter() once per label and checks that every call yields
# exactly one number, so the column is numeric from the start: no temporary
# character copy of Age, no as.numeric() round-trip, and no 1:length() index
# loop (which would still run its body on a zero-row sample).
data_random.tb$age <- vapply(
  data_random.tb$Age,
  ageFilter,
  numeric(1),
  USE.NAMES = FALSE
)

Regression: Age vs Purchase Value

# Least-squares fit of Purchase on the numeric age code. The `0` term removes
# the intercept, so the fitted line is forced through the origin.
fit <- lm(
  data_random.tb$Purchase ~ 0 + data_random.tb$age,
  data = data_random.tb
)
# Show only the estimated coefficient(s).
fit["coefficients"]
## $coefficients
## data_random.tb$age 
##           2273.659

Correlation: Age vs Purchase Value

# Correlation between the numeric age code and the purchase amount
# (cor() defaults to Pearson).
with(data_random.tb, cor(as.numeric(age), as.numeric(Purchase)))
## [1] -0.0177134
# One-way analysis of variance of Purchase across the Age bracket labels,
# followed by its summary table.
anova <- aov(
  formula = data_random.tb$Purchase ~ data_random.tb$Age,
  data = data_random.tb
)
summary(anova)
##                      Df    Sum Sq  Mean Sq F value Pr(>F)  
## data_random.tb$Age    6 2.827e+08 47118284   1.917 0.0746 .
## Residuals          1993 4.899e+10 24579562                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Keep every row that has both plotted columns. na.omit() on the whole tibble
# would also discard rows whose only missing value is in a column these plots
# never use (e.g. Product_Category_2 / Product_Category_3).
data_random_na.tb <- drop_na(data_random.tb, Age, Purchase)

# Interactive bar chart of purchase amount against age bracket.
plot <- plot_ly(
  x = data_random_na.tb$Age,
  y = data_random_na.tb$Purchase,
  name = "Age vs. Purchase Amount",
  type = "bar"
)

plot

# Box plot of purchase amount per age bracket, brackets ordered youngest first.
ggboxplot(
  data_random_na.tb,
  x = "Age",
  y = "Purchase",
  order = c("0-17", "18-25", "26-35", "36-45", "46-50", "51-55", "55+"),
  ylab = "Money Spent",
  xlab = "Age"
)